# Add this to your config for sparse attention every other layer
{
  "attention_config": [[["local", "global"], "all"]],

  # sparsity config:
  # (these are the defaults for local sliding-window sparsity; training will work without this
  # section, but it is left in for illustrative purposes)
  # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for
  # more detailed config instructions and available parameters

  "sparsity_config": {
    "block": 16, # block size
    "num_local_blocks": 32
  }
}
